Para replicar las secciones de esta clase, primero debe descargar el
siguiente proyecto
de R y abrir el archivo clase-13.Rproj.
#### **Instalar/llamar las librerías de la clase**
## Install/attach the packages used in this class.
## library() (unlike require()) fails loudly if pacman is missing.
library(pacman)
p_load(tidyverse, rio, glue,
       hexbin,                        ## plot: stat_summary_hex
       patchwork, vip,                ## plot composition / variable importance
       ggrepel,                       ## plot: geom_text_repel
       stringi, tidytext, stopwords,  ## text-data
       tidymodels, finetune)          ## modeling + racing tuner

## Load the listings, drop the raw price (re-derived later from price_q)
## and normalize descriptions: lower-case, then transliterate accents and
## special characters to plain ASCII.
db <- import("input/house_prices.rds") %>%
  select(-price) %>%
  mutate(description = str_to_lower(description),
         description = stri_trans_general(str = description, id = "Latin-ASCII"))
## Rebuild the outcome: collapse each price_q label into one numeric value.
## gsub() replaces "," with "-" so parse_number() stops at the first number
## (presumably price_q encodes a range like "330,480" — TODO confirm in data).
db <- db %>% mutate(price = parse_number(gsub(",","-",price_q))) %>% select(-price_q) ## parse_number drops any non-numeric characters before or after the first number.
## Count listings per price level: the outcome takes 10 discrete values.
db %>% count(price) %>% arrange(-n) %>% head(20)## price n
## 1 966.0 1609
## 2 330.0 1600
## 3 17.8 1561
## 4 2000.0 1543
## 5 635.0 1541
## 6 1580.0 1529
## 7 783.0 1523
## 8 3000.0 1517
## 9 1250.0 1455
## 10 480.0 1432
## Hex-binned map: price summarized (stat default, presumably the mean)
## over lon/lat cells.
db %>%
ggplot(aes(lon, lat, z = price)) +
stat_summary_hex(alpha = 0.8, bins = 50) +
scale_fill_viridis_c(option = "A") + theme_light()## Tokenize: flatten the tibble to one token per row.
## One row per word: tokenize the descriptions, then remove Spanish
## stopwords by anti-joining against the stopword lexicon.
db_tidy <- db %>%
  unnest_tokens(word, description) %>%
  anti_join(get_stopwords("es"), by = "word")
## Most frequent tokens overall. Note the accent-stripped variants
## ("bano", "bao", "baos") produced by the Latin-ASCII transliteration.
db_tidy %>% count(word, sort = TRUE) %>% head(20)## word n
## 1 apartamento 10990
## 2 zona 10330
## 3 sala 8125
## 4 cocina 7685
## 5 bano 7161
## 6 dos 6807
## 7 cuenta 6759
## 8 comedor 6737
## 9 2 6222
## 10 edificio 6140
## 11 bao 6134
## 12 piso 5485
## 13 excelente 5270
## 14 terraza 5163
## 15 social 5125
## 16 3 4832
## 17 alcobas 4743
## 18 vista 4615
## 19 parqueaderos 4433
## 20 habitaciones 4087
## Keep the 100 words with the highest absolute frequency,
## excluding bare digit tokens "0".."10".
top_words <- db_tidy %>%
  count(word, sort = TRUE) %>%
  filter(!(word %in% as.character(0:10))) %>%
  slice_max(order_by = n, n = 100, with_ties = TRUE) %>%
  pull(word)
## Print the top-100 vocabulary.
top_words## [1] "apartamento" "zona" "sala" "cocina"
## [5] "bano" "dos" "cuenta" "comedor"
## [9] "edificio" "bao" "piso" "excelente"
## [13] "terraza" "social" "alcobas" "vista"
## [17] "parqueaderos" "habitaciones" "cuarto" "servicio"
## [21] "venta" "estudio" "ubicado" "acceso"
## [25] "tres" "cerca" "chimenea" "pisos"
## [29] "principal" "deposito" "exterior" "privado"
## [33] "oficina" "independiente" "garajes" "espacios"
## [37] "gimnasio" "banos" "area" "parqueadero"
## [41] "ascensor" "salon" "visitantes" "sector"
## [45] "comunal" "abierta" "parque" "chico"
## [49] "natural" "amplia" "calle" "independientes"
## [53] "lavanderia" "ciudad" "espectacular" "closet"
## [57] "alcoba" "madera" "baos" "zonas"
## [61] "integral" "m2" "amplio" "vende"
## [65] "cada" "mas" "bogota" "acabados"
## [69] "vigilancia" "mts" "24" "vias"
## [73] "gas" "ubicacion" "seguridad" "iluminacion"
## [77] "bbq" "planta" "amplios" "comercial"
## [81] "balcon" "carrera" "remodelado" "ropas"
## [85] "gran" "privada" "oficinas" "iluminado"
## [89] "saln" "nivel" "chapinero" "exclusivo"
## [93] "rentahouse" "excelentes" "hermoso" "oportunidad"
## [97] "comerciales" "hall" "horas" "norte"
## [101] "rosales"
## Example listing: shows where tokens like "bao"/"saln" come from
## (accents stripped by the ASCII transliteration above).
subset(db,id=="DgJXrP7JZuQS7trOVPj4xg==")$description## [1] "espacio - soluciones inmobiliarias ofrece en venta este espectacular apartamneto en el exclusivo sector de el refugio.el apartamento ubicado en el piso 12 del edificio construido hace un ao cuenta con una excelente vista y por lo tanto una espectacular iluminacin natural.el inmueble cuenta con un rea de 85m2 distribuidos asi: zona social con balcn y chimenea cocina abierta tipo americano con zona de ropas baos social habitacin principal con wlak-in closet y bao privado y la habitacin secudaria con bao privado.el apartamento cuenta con puerta de sueguridad ventaneratermoacstica dos parqueaderos y un depsito.el edificio tiene lobby gimnasio totalmente dotado vigilancia privada 24/7 y terraza con parque infantil y con nosotros para brindarte ms informacin.am (197)"
## Count word-price combinations.
count_words <- db_tidy %>%
count(word, price) %>%
complete(word, price, fill = list(n = 0)) ## expand every possible word-price combination, filling missing cells with n = 0
count_words %>% head(20)## # A tibble: 20 × 3
## word price n
## <chr> <dbl> <int>
## 1 _7 17.8 1
## 2 _7 330 0
## 3 _7 480 1
## 4 _7 635 0
## 5 _7 783 0
## 6 _7 966 0
## 7 _7 1250 0
## 8 _7 1580 1
## 9 _7 2000 0
## 10 _7 3000 0
## 11 _arrienda 17.8 0
## 12 _arrienda 330 0
## 13 _arrienda 480 0
## 14 _arrienda 635 0
## 15 _arrienda 783 0
## 16 _arrienda 966 0
## 17 _arrienda 1250 0
## 18 _arrienda 1580 0
## 19 _arrienda 2000 0
## 20 _arrienda 3000 1
## Relative frequency of each word within every price level,
## restricted to the top-100 word list.
word_freqs <- count_words %>%
  group_by(price) %>%
  mutate(price_sum = sum(n)) %>%
  mutate(proportion = n / price_sum) %>%
  ungroup() %>%
  filter(word %in% top_words)
## Inspect the relative frequencies.
word_freqs## # A tibble: 1,010 × 5
## word price n price_sum proportion
## <chr> <dbl> <int> <int> <dbl>
## 1 24 17.8 267 55294 0.00483
## 2 24 330 299 60276 0.00496
## 3 24 480 201 57873 0.00347
## 4 24 635 191 61729 0.00309
## 5 24 783 160 59439 0.00269
## 6 24 966 196 66184 0.00296
## 7 24 1250 136 62379 0.00218
## 8 24 1580 119 65235 0.00182
## 9 24 2000 133 68475 0.00194
## 10 24 3000 93 69611 0.00134
## # … with 1,000 more rows
## Identify words whose frequency rises or falls with price: for each word,
## fit a binomial GLM of counts vs. the price level; the "price" slope is
## the quantity of interest. p-values are multiplicity-adjusted
## (p.adjust default method, Holm) and words sorted by slope, descending.
## NOTE(review): cbind(n, price_sum) passes the level total — not
## price_sum - n — as the "failures" column; confirm this is intended.
word_model <- word_freqs %>%
nest(data = c(price, n, price_sum, proportion)) %>%
mutate(model = map(.x = data,
.f = ~ glm(cbind(n, price_sum) ~ price , data = . , family = "binomial")),
model = map(model, tidy)) %>%
unnest(model) %>%
filter(term == "price") %>%
mutate(p.value = p.adjust(p.value)) %>%
arrange(-estimate)
## Positive estimates = words that get more common as price rises.
word_model## # A tibble: 101 × 7
## word data term estimate std.error statistic p.value
## <chr> <list> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 cada <tibble [10 × 4]> price 0.000422 0.0000237 17.8 4.13e-69
## 2 cuarto <tibble [10 × 4]> price 0.000357 0.0000169 21.1 7.37e-97
## 3 espectacular <tibble [10 × 4]> price 0.000342 0.0000228 15.0 5.74e-49
## 4 servicio <tibble [10 × 4]> price 0.000338 0.0000171 19.8 5.58e-85
## 5 exclusivo <tibble [10 × 4]> price 0.000331 0.0000292 11.3 8.27e-28
## 6 mas <tibble [10 × 4]> price 0.000295 0.0000243 12.2 3.91e-32
## 7 planta <tibble [10 × 4]> price 0.000267 0.0000267 10.0 9.39e-22
## 8 oficinas <tibble [10 × 4]> price 0.000249 0.0000288 8.64 3.59e-16
## 9 acabados <tibble [10 × 4]> price 0.000243 0.0000248 9.79 9.26e-21
## 10 gimnasio <tibble [10 × 4]> price 0.000242 0.0000207 11.7 1.15e-29
## # … with 91 more rows
## Volcano-style plot: slope estimate vs. adjusted p-value (log10 y axis).
word_model %>%
ggplot(aes(estimate, p.value)) +
geom_vline(xintercept = 0, lty = 2, alpha = 0.7, color = "gray50") +
geom_point(color = "midnightblue", alpha = 0.8, size = 2.5) +
scale_y_log10() +
geom_text_repel(aes(label = word))## Warning: ggrepel: 78 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Among significant words (adjusted p < 0.05), pick the 12 with the most
## positive and the 12 with the most negative price slopes.
higher_words <- word_model %>%
  filter(p.value < 0.05) %>%
  slice_max(order_by = estimate, n = 12) %>%
  pull(word)
lower_words <- word_model %>%
  filter(p.value < 0.05) %>%
  slice_min(order_by = estimate, n = 12) %>%
  pull(word)
## Plot lower_words: proportion vs. price, one panel per word.
word_freqs %>%
filter(word %in% lower_words) %>%
ggplot(aes(price, proportion, color = word)) +
geom_line(size = 2.5, alpha = 0.7, show.legend = FALSE) +
facet_wrap(vars(word), scales = "free_y") + theme_light()## Same plot for higher_words.
word_freqs %>%
filter(word %in% higher_words) %>%
ggplot(aes(price, proportion, color = word)) +
geom_line(size = 2.5, alpha = 0.7, show.legend = FALSE) +
facet_wrap(vars(word), scales = "free_y") + theme_light()## Build the train/test dataset.
## Train/test split, stratified by the outcome (coerced to a factor so the
## problem becomes multiclass classification). Seeds fix the random draws.
set.seed(123)
db_split <- db %>% mutate(price=as.factor(price)) %>%
initial_split(strata = price)
db_train <- training(db_split)
db_test <- testing(db_split)
## 5-fold cross-validation on the training set, stratified by price.
set.seed(234)
db_folds <- vfold_cv(data=db_train, v=5 , strata=price)
db_folds## # 5-fold cross-validation using stratification
## # A tibble: 5 × 2
## splits id
## <list> <chr>
## 1 <split [9182/2298]> Fold1
## 2 <split [9183/2297]> Fold2
## 3 <split [9184/2296]> Fold3
## 4 <split [9184/2296]> Fold4
## 5 <split [9187/2293]> Fold5
## Evaluation metric set.
db_metrics <- metric_set(accuracy, roc_auc, mn_log_loss)
## Collapse each word list into a single alternation regex ("w1|w2|...")
## for use in step_regex() below.
higher_pattern <- glue_collapse(higher_words, sep = "|")
lower_pattern <- glue_collapse(lower_words, sep = "|")
## Preprocessing recipe.
## NOTE(review): browseURL() opens the recipes manual in a browser every
## time the script runs — a teaching aid; consider commenting it out.
browseURL("https://cran.r-project.org/web/packages/recipes/recipes.pdf")
db_recipe <- recipe(formula=price ~ . , data=db_train) %>% ## A recipe lists the preprocessing steps applied to the data before modeling.
update_role(id, new_role = "id") %>% ## keep id for bookkeeping, not as a predictor
step_regex(description, pattern=higher_pattern , result="high_price_words") %>% ## dummy: description matches a high-price word
step_regex(description, pattern=lower_pattern , result="low_price_words") %>% ## dummy: description matches a low-price word
step_rm(description) %>% ## drop the raw text
step_novel(property_type) %>% ## handle factor levels unseen at train time
#step_unknown(homeType) %>%
#step_other(homeType, threshold = 0.02) %>%
step_dummy(all_nominal_predictors()) %>% ## one-hot the remaining factors
step_nzv(all_predictors()) ## drop near-zero-variance predictors
db_recipe## Recipe
##
## Inputs:
##
## role #variables
## id 1
## outcome 1
## predictor 12
##
## Operations:
##
## Regular expression dummy variable using "cada|cuarto|espectacular|servicio|exclusivo|ma...
## Regular expression dummy variable using "chapinero|parqueadero|horas|calle|24|vigilanci...
## Variables removed description
## Novel factor level assignment for property_type
## Dummy variables from all_nominal_predictors()
## Sparse, unbalanced variable filter on all_predictors()
## XGBoost classification spec: 1000 trees fixed; depth, min_n, mtry,
## subsample rate and learning rate are left for tuning.
xgb_spec <- boost_tree(mtry = tune(),
                       trees = 1000,
                       min_n = tune(),
                       tree_depth = tune(),
                       sample_size = tune(),
                       learn_rate = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")
## Print the (untuned) specification.
xgb_spec## Boosted Tree Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 1000
## min_n = tune()
## tree_depth = tune()
## learn_rate = tune()
## sample_size = tune()
##
## Computational engine: xgboost
## Bundle recipe + model into a workflow.
xgb_word_wf <- workflow(db_recipe, xgb_spec)
## Space-filling (max-entropy) grid of 20 hyperparameter candidates.
## sample_prop() supplies values for the sample_size argument;
## learn_rate bounds are on the log10 scale (10^-2 to 10^-1).
xgb_grid <- grid_max_entropy(tree_depth(c(5L, 10L)),
min_n(c(10L, 40L)),
mtry(c(5L, 10L)),
sample_prop(c(0.5, 1.0)),
learn_rate(c(-2, -1)),
size = 20)
xgb_grid## # A tibble: 20 × 5
## tree_depth min_n mtry sample_size learn_rate
## <int> <int> <int> <dbl> <dbl>
## 1 6 28 6 0.891 0.0466
## 2 6 14 6 0.952 0.0126
## 3 6 27 9 0.932 0.0660
## 4 5 36 10 0.501 0.0868
## 5 10 36 8 0.587 0.0228
## 6 9 37 9 0.856 0.0371
## 7 7 32 10 0.581 0.0420
## 8 8 26 5 0.998 0.0600
## 9 10 15 8 0.902 0.0887
## 10 7 27 6 0.690 0.0613
## 11 6 39 9 0.783 0.0306
## 12 9 20 5 0.991 0.0142
## 13 8 14 10 0.946 0.0115
## 14 5 17 5 0.778 0.0234
## 15 6 30 5 0.628 0.0108
## 16 8 19 10 0.669 0.0228
## 17 6 13 7 0.771 0.0582
## 18 5 18 7 0.999 0.0802
## 19 9 17 6 0.587 0.0195
## 20 7 26 8 0.687 0.0205
## Tune with ANOVA racing: candidates that are clearly worse after the
## first folds are eliminated instead of being fully resampled.
xgb_word_rs <- tune_race_anova(object = xgb_word_wf,
                               resamples = db_folds,
                               grid = xgb_grid,
                               metrics = metric_set(mn_log_loss),
                               control = control_race(verbose_elim = TRUE)) ## TRUE, not T: T can be reassigned
xgb_word_rs
## # Tuning results
## # 5-fold cross-validation using stratification
## # A tibble: 5 × 5
## splits id .order .metrics .notes
## <list> <chr> <int> <list> <list>
## 1 <split [9182/2298]> Fold1 3 <tibble [20 × 9]> <tibble [20 × 3]>
## 2 <split [9183/2297]> Fold2 1 <tibble [20 × 9]> <tibble [20 × 3]>
## 3 <split [9184/2296]> Fold3 2 <tibble [20 × 9]> <tibble [20 × 3]>
## 4 <split [9184/2296]> Fold4 5 <tibble [1 × 9]> <tibble [1 × 3]>
## 5 <split [9187/2293]> Fold5 4 <tibble [1 × 9]> <tibble [0 × 3]>
##
## There were issues with some computations:
##
## - Warning(s) x20: There are new levels in a factor: La SalleThere are new levels in... - Warning(s) x20: There are new levels in a factor: La SalleThere are new levels in... - Warning(s) x1: There are new levels in a factor: La SalleThere are new levels in... - Warning(s) x20: There are new levels in a factor: La SalleThere are new levels in...
##
## Run `show_notes(.Last.tune.result)` for more information.
## Racing plot: metric trajectories of surviving candidates per fold.
plot_race(xgb_word_rs)## Best configuration by mean CV log loss.
show_best(xgb_word_rs)## # A tibble: 1 × 11
## mtry min_n tree_depth learn_rate sample_size .metric .estimator mean n
## <int> <int> <int> <dbl> <dbl> <chr> <chr> <dbl> <int>
## 1 8 15 10 0.0887 0.902 mn_log_l… multiclass 1.20 5
## # … with 2 more variables: std_err <dbl>, .config <chr>
## Finalize with the best hyperparameters, fit once on the full training
## set and evaluate on the held-out test split.
xgb_last <- xgb_word_wf %>%
finalize_workflow(select_best(xgb_word_rs, "mn_log_loss")) %>%
last_fit(db_split)## Warning: package 'xgboost' was built under R version 4.1.2
## ! train/test split: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Modelo Norte
xgb_last## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [11480/3830]> train/test spl… <tibble> <tibble> <tibble> <workflow>
##
## There were issues with some computations:
##
## - Warning(s) x1: There are new levels in a factor: Modelo Norte
##
## Run `show_notes(.Last.tune.result)` for more information.
## Multiclass log loss on the test set (compare 1.20 mean in CV).
collect_predictions(xgb_last) %>%
mn_log_loss(price, `.pred_17.8`:`.pred_3000`)## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mn_log_loss multiclass 1.11
## Predictions vs. truth: confusion matrix on the test set.
predictions <- collect_predictions(xgb_last) %>%
conf_mat(price, .pred_class)
predictions## Truth
## Prediction 17.8 330 480 635 783 966 1250 1580 2000 3000
## 17.8 289 42 11 7 4 8 4 5 1 9
## 330 53 271 57 14 9 7 2 2 3 2
## 480 10 37 193 46 17 6 7 3 3 1
## 635 7 24 64 215 44 23 13 6 4 5
## 783 6 6 22 49 218 37 15 13 2 2
## 966 4 2 12 23 39 224 51 28 7 6
## 1250 2 2 0 7 17 38 210 33 32 7
## 1580 5 6 3 3 5 14 38 251 37 20
## 2000 6 3 4 8 9 20 28 41 242 45
## 3000 10 5 7 5 3 7 7 22 44 290
## Heat-map of the confusion matrix.
predictions %>%
autoplot() + theme_light()## ROC curve, one-vs-all per price class.
collect_predictions(xgb_last) %>%
roc_curve(price, `.pred_17.8`:`.pred_3000`) %>%
ggplot(aes(1 - specificity, sensitivity, color = .level)) +
geom_abline(lty = 2, color = "gray80", size = 1.5) +
geom_path(alpha = 0.8, size = 1.2) +
coord_equal() + theme_light()## Variable importance (top 15) from the fitted xgboost model.
extract_workflow(xgb_last) %>%
extract_fit_parsnip() %>%
vip(geom = "point", num_features = 15) + theme_light()